On this markdown, data exploration will be done at a high level (and last 10 years only). The parameters of interest that were selected (for now) are: temperature, dissolved oxygen, turbidity and TSS.
The available databases on the WQP are NWIS (USGS) and WQX (EPA). WQX is a data warehouse for water quality, biological, and physical data used by state environmental agencies, the EPA, other federal agencies, universities, private citizens, and others.
Note for pulling data: The WQP brings data from all these organizations together and provides it in a single format that has a more verbose output than NWIS. To get non-NWIS data, need to use CharacteristicName instead of parameter code.
Overview of column names: We are using two dataRetrieval functions, whatWQPsites and readWQPdata.
## [1] "OrganizationIdentifier"
## [2] "OrganizationFormalName"
## [3] "MonitoringLocationIdentifier"
## [4] "MonitoringLocationName"
## [5] "MonitoringLocationTypeName"
## [6] "MonitoringLocationDescriptionText"
## [7] "HUCEightDigitCode"
## [8] "DrainageAreaMeasure.MeasureValue"
## [9] "DrainageAreaMeasure.MeasureUnitCode"
## [10] "ContributingDrainageAreaMeasure.MeasureValue"
## [11] "ContributingDrainageAreaMeasure.MeasureUnitCode"
## [12] "LatitudeMeasure"
## [13] "LongitudeMeasure"
## [14] "SourceMapScaleNumeric"
## [15] "HorizontalAccuracyMeasure.MeasureValue"
## [16] "HorizontalAccuracyMeasure.MeasureUnitCode"
## [17] "HorizontalCollectionMethodName"
## [18] "HorizontalCoordinateReferenceSystemDatumName"
## [19] "VerticalMeasure.MeasureValue"
## [20] "VerticalMeasure.MeasureUnitCode"
## [21] "VerticalAccuracyMeasure.MeasureValue"
## [22] "VerticalAccuracyMeasure.MeasureUnitCode"
## [23] "VerticalCollectionMethodName"
## [24] "VerticalCoordinateReferenceSystemDatumName"
## [25] "CountryCode"
## [26] "StateCode"
## [27] "CountyCode"
## [28] "AquiferName"
## [29] "LocalAqfrName"
## [30] "FormationTypeText"
## [31] "AquiferTypeName"
## [32] "ConstructionDateText"
## [33] "WellDepthMeasure.MeasureValue"
## [34] "WellDepthMeasure.MeasureUnitCode"
## [35] "WellHoleDepthMeasure.MeasureValue"
## [36] "WellHoleDepthMeasure.MeasureUnitCode"
## [37] "ProviderName"
List of column names (from the readWQPdata output):
## [1] "OrganizationIdentifier"
## [2] "OrganizationFormalName"
## [3] "ActivityIdentifier"
## [4] "ActivityTypeCode"
## [5] "ActivityMediaName"
## [6] "ActivityMediaSubdivisionName"
## [7] "ActivityStartDate"
## [8] "ActivityStartTime.Time"
## [9] "ActivityStartTime.TimeZoneCode"
## [10] "ActivityEndDate"
## [11] "ActivityEndTime.Time"
## [12] "ActivityEndTime.TimeZoneCode"
## [13] "ActivityDepthHeightMeasure.MeasureValue"
## [14] "ActivityDepthHeightMeasure.MeasureUnitCode"
## [15] "ActivityDepthAltitudeReferencePointText"
## [16] "ActivityTopDepthHeightMeasure.MeasureValue"
## [17] "ActivityTopDepthHeightMeasure.MeasureUnitCode"
## [18] "ActivityBottomDepthHeightMeasure.MeasureValue"
## [19] "ActivityBottomDepthHeightMeasure.MeasureUnitCode"
## [20] "ProjectIdentifier"
## [21] "ActivityConductingOrganizationText"
## [22] "MonitoringLocationIdentifier"
## [23] "ActivityCommentText"
## [24] "SampleAquifer"
## [25] "HydrologicCondition"
## [26] "HydrologicEvent"
## [27] "SampleCollectionMethod.MethodIdentifier"
## [28] "SampleCollectionMethod.MethodIdentifierContext"
## [29] "SampleCollectionMethod.MethodName"
## [30] "SampleCollectionEquipmentName"
## [31] "ResultDetectionConditionText"
## [32] "CharacteristicName"
## [33] "ResultSampleFractionText"
## [34] "ResultMeasureValue"
## [35] "ResultMeasure.MeasureUnitCode"
## [36] "MeasureQualifierCode"
## [37] "ResultStatusIdentifier"
## [38] "StatisticalBaseCode"
## [39] "ResultValueTypeName"
## [40] "ResultWeightBasisText"
## [41] "ResultTimeBasisText"
## [42] "ResultTemperatureBasisText"
## [43] "ResultParticleSizeBasisText"
## [44] "PrecisionValue"
## [45] "ResultCommentText"
## [46] "USGSPCode"
## [47] "ResultDepthHeightMeasure.MeasureValue"
## [48] "ResultDepthHeightMeasure.MeasureUnitCode"
## [49] "ResultDepthAltitudeReferencePointText"
## [50] "SubjectTaxonomicName"
## [51] "SampleTissueAnatomyName"
## [52] "ResultAnalyticalMethod.MethodIdentifier"
## [53] "ResultAnalyticalMethod.MethodIdentifierContext"
## [54] "ResultAnalyticalMethod.MethodName"
## [55] "MethodDescriptionText"
## [56] "LaboratoryName"
## [57] "AnalysisStartDate"
## [58] "ResultLaboratoryCommentText"
## [59] "DetectionQuantitationLimitTypeName"
## [60] "DetectionQuantitationLimitMeasure.MeasureValue"
## [61] "DetectionQuantitationLimitMeasure.MeasureUnitCode"
## [62] "PreparationStartDate"
## [63] "ProviderName"
## [64] "timeZoneStart"
## [65] "timeZoneEnd"
## [66] "ActivityStartDateTime"
## [67] "ActivityEndDateTime"
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
Monitoring sites plot
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
Summary of sites by monitoring location type (The descriptive name for a type of monitoring location)
# Tally sites per monitoring-location type; count() is the one-step
# equivalent of group_by() + summarise(n()) and returns an ungrouped frame.
site_summary <- klamath_sites |>
  count(MonitoringLocationTypeName, name = "Site_Count")

# Interactive table, 10 rows per page
DT::datatable(site_summary, options = list(pageLength = 10))
Summary: Total number of records per organization
# Inventory of available WQP records for the Klamath basin (HUC 180102);
# whatWQPdata() returns one row per site with a resultCount column.
klamath_data_summary <- whatWQPdata(huc = "180102")

# Total number of records per organization.
# na.rm = TRUE guards against sites with a missing resultCount.
org_summary <- klamath_data_summary |>
  group_by(OrganizationFormalName) |>
  summarise(
    Total_Results = sum(resultCount, na.rm = TRUE),
    .groups = "drop"
  )

# Interactive table, 10 rows per page
DT::datatable(org_summary, options = list(pageLength = 10))
Summary: Monitoring locations with the most data
# Monitoring locations ranked by how much data they hold
site_data_summary <- klamath_data_summary |>
  select(MonitoringLocationName, resultCount) |>
  arrange(desc(resultCount))

# Show only the ten richest sites; slice_head() is the data-frame
# equivalent of head(x, 10).
site_data_summary |>
  slice_head(n = 10) |>
  DT::datatable(options = list(pageLength = 10))
# Get data from the last 10 years (instead of 3).
# Use lubridate's %m-% rather than plain `- years(10)`: on a leap day
# (Feb 29) subtracting a period lands on an impossible date and returns
# NA, while %m-% rolls back to the last valid day of the month.
start_date <- as.character(Sys.Date() %m-% years(10))

# Select the top 20 sites based on result count
top_sites <- klamath_data_summary |>
  arrange(desc(resultCount)) |>
  slice_head(n = 20) |>
  pull(MonitoringLocationIdentifier)
#
# # Pick the first site to test
# test_site <- top_sites[1]
#
# # Run test query
# test_data <- readWQPdata(
# siteid = test_site,
# startDateLo = start_date
# )
#
# # Check if test site returned data
# if (nrow(test_data) == 0) {
# message("Test site returned no data. Consider changing sites or increasing the timeframe.")
# } else {
# DT::datatable(head(test_data, 10), options = list(pageLength = 10))
# }
#
#
#
# # Retrieve data for the top 20 sites within the last 10 years
# klamath_sample_data <- readWQPdata(
# siteid = top_sites,
# startDateLo = start_date
# )
#
# # Check if data was retrieved successfully
# if (nrow(klamath_sample_data) == 0) {
# message("No data found for the selected sites. Consider further increasing the date range or filtering by parameter.")
# } else {
# DT::datatable(head(klamath_sample_data, 10), options = list(pageLength = 10))
# }
Summary per data provider: showing the top 10 providers with the most data
# Find which providers have the most data: tally rows per ProviderName,
# then order with the largest counts first (same result as
# count(ProviderName, sort = TRUE), spelled out step by step).
provider_summary <- klamath_data_summary |>
  group_by(ProviderName) |>
  summarise(n = n(), .groups = "drop") |>
  arrange(desc(n))

# Interactive table, 10 rows per page
DT::datatable(provider_summary, options = list(pageLength = 10))
# Horizontal bar chart of total result counts per location type.
# geom_col stacks the per-site resultCount values within each type;
# coord_flip keeps the long type names readable.
ggplot(klamath_data_summary,
       aes(x = MonitoringLocationTypeName, y = resultCount)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Data Availability by Monitoring Location Type",
    x = "Monitoring Location Type",
    y = "Total Number of Results"
  )